In [126]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import itertools
import spacy
import nltk
%matplotlib inline
In [127]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
In [128]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
In [129]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
In [130]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
In [131]:
# dataframe display options
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 200)
The objective of the project is to use Natural Language Processing on two fields entered by the user ("event title" and "additional info") to guide the user toward selecting the correct category.
- "Event type" (category) is a key field used by the company to match events with the capabilities of their pool of photographers.
To validate the feasibility of the project, I scraped information on 100 events. Links to the events were provided by the company.
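As a rough sketch of the intended approach (not the model actually trained here, and assuming the cleaned dataframe g with 'title', 'additional_info' and 'category' columns built below), the two free-text fields can be concatenated and fed through a TF-IDF + classifier pipeline:
# Sketch only: concatenate the two user-entered fields and fit a simple baseline classifier.
text = g['title'].fillna('') + ' ' + g['additional_info'].fillna('')
X_train, X_test, y_train, y_test = train_test_split(text, g['category'], test_size=0.2, random_state=42)
baseline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', MultinomialNB()),
])
baseline.fit(X_train, y_train)
print(classification_report(y_test, baseline.predict(X_test)))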
In [699]:
def search_titles(df, expression):
    """Print the categories matching a regex and return the indices of the matching rows."""
    mask = df['category'].str.contains(expression, regex=True, na=False)
    categories = df.loc[mask, 'category']
    idxs = df.index[mask]
    print(categories)
    return idxs

def update_category(df, indices, category_title):
    """Overwrite the category of the given rows with a single consolidated label."""
    for i in indices:
        df.loc[i, 'category'] = category_title
In [702]:
b = search_titles(g, r'baptism')
In [125]:
update_category(g, b, 'life celebration')
In [150]:
g['category'].value_counts()[:20][::-1].plot(kind='barh');
plt.title("Top 20 categories");
In [728]:
## Pickle data after cleanup
with open('events.pickle', 'wb') as f:
    pickle.dump(g, f)
In [201]:
%reset -fs
In [8]:
with open('events.pickle', 'rb') as f:
    g = pickle.load(f)
In [9]:
categories = g.category.unique()
print("Number of categories: {}".format(len(categories)))
In [10]:
g['category'].value_counts()[:20][::-1].plot(kind='barh');
plt.title("Top Categories");
In [11]:
g['category'].value_counts()[:20]
Out[11]:
In [12]:
g.tail()
Out[12]:
In [13]:
length = g[['title', 'additional_info', 'category']].copy()
In [14]:
length['title'][0]
Out[14]:
In [53]:
length['title_length'] = length['title'].map(lambda text: len(str(text)))
length.head()
Out[53]:
In [54]:
length['additional_info_length'] = length['additional_info'].map(lambda text: len(str(text)))
length.head()
Out[54]:
In [55]:
length.title_length.plot(bins=20, kind='hist');
plt.title("Title - Character length");
In [56]:
length.additional_info_length.plot(bins=15, kind='hist');
plt.title("Additional Info - Character length");
We can see that "additional info" is an optional field and that the overall character length of each sample is small.
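To put numbers behind that observation, a quick check along these lines (a sketch; it assumes missing "additional info" entries are stored as NaN) shows the share of events without additional info and the typical lengths of both fields:
# Sketch only: quantify how often "additional info" is left empty and how long the fields are.
print("Share of events with no additional info: {:.0%}".format(length['additional_info'].isna().mean()))
print(length[['title_length', 'additional_info_length']].describe())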
In [57]:
length[length["category"] == 'kids birthday']["title_length"].plot(bins=20, kind='hist');
plt.title("KIDS BIRTHDAY - Title - Character length");